{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "corpus = [\n",
    "    'UNC played Duke in basketball',\n",
    "    'Duke lost the basketball game'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1 1 0 1 0 1 0 1]\n",
      " [1 1 1 0 1 0 1 0]]\n",
      "{'played': 5, 'the': 6, 'in': 3, 'lost': 4, 'game': 2, 'basketball': 0, 'unc': 7, 'duke': 1}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer = CountVectorizer()\n",
    "print(vectorizer.fit_transform(corpus).todense())\n",
    "print(vectorizer.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 1 1 0 1 0 1 0 0 1]\n",
      " [0 1 1 1 0 1 0 0 1 0]\n",
      " [1 0 0 0 0 0 0 1 0 0]]\n",
      "{'played': 6, 'the': 8, 'in': 4, 'game': 3, 'lost': 5, 'ate': 0, 'sandwich': 7, 'basketball': 1, 'unc': 9, 'duke': 2}\n"
     ]
    }
   ],
   "source": [
    "corpus.append('I ate a sandwich')\n",
    "print(vectorizer.fit_transform(corpus).todense())\n",
    "print(vectorizer.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Distance between 1st and 2nd documents: [[ 2.44948974]]\n",
      "Distance between 1st and 3rd documents: [[ 2.64575131]]\n",
      "Distance between 2nd and 3rd documents: [[ 2.64575131]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics.pairwise import euclidean_distances\n",
    "X = vectorizer.fit_transform(corpus).todense()\n",
    "print('Distance between 1st and 2nd documents:', euclidean_distances(X[0], X[1]))\n",
    "print('Distance between 1st and 3rd documents:', euclidean_distances(X[0], X[2]))\n",
    "print('Distance between 2nd and 3rd documents:', euclidean_distances(X[1], X[2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 1 1 0 0 1 0 1]\n",
      " [0 1 1 1 1 0 0 0]\n",
      " [1 0 0 0 0 0 1 0]]\n",
      "{'played': 5, 'game': 3, 'lost': 4, 'ate': 0, 'sandwich': 6, 'basketball': 1, 'unc': 7, 'duke': 2}\n"
     ]
    }
   ],
   "source": [
    "vectorizer = CountVectorizer(stop_words='english')\n",
    "print(vectorizer.fit_transform(corpus).todense())\n",
    "print(vectorizer.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1 0 0 1]\n",
      " [0 1 1 0]]\n",
      "{'ate': 0, 'eaten': 1, 'sandwich': 2, 'sandwiches': 3}\n"
     ]
    }
   ],
   "source": [
    "corpus = [\n",
    "    'He ate the sandwiches',\n",
    "    'Every sandwich was eaten by him'\n",
    "]\n",
    "vectorizer = CountVectorizer(binary=True, stop_words='english')\n",
    "print(vectorizer.fit_transform(corpus).todense())\n",
    "print(vectorizer.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "corpus = [\n",
    "    'I am gathering ingredients for the sandwich.',\n",
    "    'There were many wizards at the gathering.'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gather\n",
      "gathering\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "print(lemmatizer.lemmatize('gathering', 'v'))\n",
    "print(lemmatizer.lemmatize('gathering', 'n'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gather\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "\n",
    "stemmer = PorterStemmer()\n",
    "print(stemmer.stem('gathering'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
